{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text",
    "id": "view-in-github"
   },
   "source": [
    "<a href=\"https://colab.research.google.com/github/CoreTheGreat/HBPU-Machine-Learning-Course/blob/main/ML_Chapter3_Classification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "lPboLx_o0UxI"
   },
   "source": [
    "# 第三章：分类\n",
    "湖北理工学院《机器学习》课程资料\n",
    "\n",
    "作者：李辉楚吴\n",
    "\n",
    "笔记内容概述: 精确率 Precision、召回率 Recall、F1 Score\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Ifpm7Sql4U09"
   },
   "source": [
    "## Step 1: 数据准备\n",
    "\n",
    "使用make_moons生成虚拟数据构建分类任务"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000
    },
    "id": "sM-ziKb94S9_",
    "outputId": "9a8bd59b-1ed4-47e3-e118-cee1849a610a"
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import make_moons\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "label_size = 18 # Label size\n",
    "ticklabel_size = 14 # Tick label size\n",
    "\n",
    "# Generate moon-shaped data\n",
    "X, Y = make_moons(n_samples=1000, noise=0.4, random_state=42)\n",
    "\n",
    "# Split X and Y into training and testing sets\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)\n",
    "\n",
    "# Plot the data\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "ax.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='red', label='Positive - Train')\n",
    "ax.scatter(X_test[y_test == 1, 0], X_test[y_test == 1, 1], color='orange', label='Positive - Test')\n",
    "ax.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='blue', label='Negative - Train')\n",
    "ax.scatter(X_test[y_test == 0, 0], X_test[y_test == 0, 1], color='green', label='Negative - Test')\n",
    "ax.set_xlabel('Feature 1', fontsize=label_size)\n",
    "ax.set_ylabel('Feature 2', fontsize=label_size)\n",
    "\n",
    "ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)\n",
    "\n",
    "# Set legend fontsize\n",
    "plt.legend(prop={'size': 14})\n",
    "\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: 使用逻辑回归和SVM进行分类，输出概率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BWPSDyOq5qOO",
    "outputId": "9aed06f2-67a9-4a6f-f00c-d91554c3a260"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "\n",
    "# Model Definition\n",
    "mdl_lr = LogisticRegression() # Logistic Regression\n",
    "mdl_svm = SVC(probability=True) # Support Vector Machine, set probability to draw P-R Curve\n",
    "\n",
    "# Model Training\n",
    "mdl_lr.fit(X_train, y_train) # Logistic Regression\n",
    "print('Logistic regression model trained successfully...')\n",
    "\n",
    "mdl_svm.fit(X_train, y_train) # Support Vector machine\n",
    "print('Support vector machine trained successfully...')\n",
    "\n",
    "# Get confusion matrix of all models\n",
    "# Predict probabilities for each model\n",
    "y_proba_lr = mdl_lr.predict_proba(X_test)[:, 1] # Using probability of positive class\n",
    "y_proba_svm = mdl_svm.predict_proba(X_test)[:, 1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "BtGNbqlSKnC-"
   },
   "source": [
    "## Step 3: 调整阈值，改变精确率和召回率"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def confusion_matrix(y_pred, y):\n",
    "    '''\n",
    "    y_pred - predict classes\n",
    "    y - actual classes\n",
    "    '''\n",
    "    # Get the number of classes\n",
    "    n_classes = len(np.unique(y))\n",
    "    \n",
    "    # Initialize the confusion matrix with zeros\n",
    "    cm = np.zeros((n_classes, n_classes))\n",
    "    \n",
    "    # Fill the confusion matrix\n",
    "    for pred, actual in zip(y_pred, y):\n",
    "        cm[pred][actual] += 1\n",
    "    \n",
    "    return cm\n",
    "\n",
    "def get_prediction_from_proba(y_proba, threshold=0.5):\n",
    "    '''\n",
    "    y_proba - predict probabilities\n",
    "    threshold - threshold of probability\n",
    "    \n",
    "    y_pred - predict classes, y_proba > threshold is positive, otherwise negative\n",
    "    '''\n",
    "    y_pred = np.where(y_proba > threshold, 1, 0)\n",
    "    return y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_precision_recall_f1(y_pred, y):\n",
    "    '''\n",
    "    Compute precision, recall, and F1 score of binary classification\n",
    "    y_pred - predict classes\n",
    "    y - actual classes\n",
    "    '''\n",
    "    epsilon = 1e-10\n",
    "    \n",
    "    # Compute confusion matrix\n",
    "    cm = confusion_matrix(y_pred, y)\n",
    "    \n",
    "    # Compute precision, recall, and F1 score\n",
    "    precision = cm[1, 1] / (cm[1, 1] + cm[1,0] + epsilon)\n",
    "    recall = cm[1, 1] / (cm[1, 1] + cm[0, 1] + epsilon)\n",
    "    f1 = 2 * (precision * recall) / (precision + recall + epsilon)\n",
    "    \n",
    "    return precision, recall, f1\n",
    "\n",
    "# Get predictions from probabilities\n",
    "for threshold in np.linspace(0, 1, 11):\n",
    "    y_pred_lr = get_prediction_from_proba(y_proba_lr, threshold)\n",
    "    precision_lr, recall_lr, f1_lr = get_precision_recall_f1(y_pred_lr, y_test)\n",
    "    print(f\"When threshold = {threshold:0.1f} - Precision: {precision_lr:.4f}, Recall: {recall_lr:.4f}, F1 Score: {f1_lr:.4f}\")\n",
    "\n",
    "print(confusion_matrix(y_pred_lr, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 4: 调整Threshold，绘制P-R曲线图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import precision_recall_curve\n",
    "\n",
    "# Calculate precision and recall for various thresholds\n",
    "precision_lr, recall_lr, thresholds_lr = precision_recall_curve(y_test, y_proba_lr)\n",
    "precision_svm, recall_svm, thresholds_svm = precision_recall_curve(y_test, y_proba_svm)\n",
    "\n",
    "# Create the P-R curve of Logistic Regression and Support Vector Machine\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "ax.plot(recall_lr, precision_lr, marker='.', color='tab:blue', label='Model 2')\n",
    "ax.plot(recall_svm, precision_svm, marker='.', color='tab:orange', label='Model 3')\n",
    "ax.set_xlabel('Recall', fontsize=label_size)\n",
    "ax.set_ylabel('Precision', fontsize=label_size)\n",
    "ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)\n",
    "plt.legend(fontsize=ticklabel_size)\n",
    "\n",
    "# Add some threshold annotations of Logistic Regression\n",
    "for i in [0, int(len(thresholds_lr)/2), len(thresholds_lr)-1]:\n",
    "    plt.annotate(f'Threshold: {thresholds_lr[i]:.2f}', \n",
    "                    xy=(recall_lr[i]*0.8, thresholds_lr[i]), \n",
    "                    xytext=(5, 5), \n",
    "                    textcoords='offset points',\n",
    "                    fontsize=ticklabel_size)\n",
    "\n",
    "plt.savefig('P-R_Curve.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Calculate F1 scores for SVM model\n",
    "f1_svm = 2 * (precision_svm * recall_svm) / (precision_svm + recall_svm)\n",
    "\n",
    "# Drawing a curve with Recall as x-axis and F1-score as y-axis\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "ax.plot(recall_svm, f1_svm, marker='.', color='tab:orange', label='Model 3')\n",
    "ax.set_xlabel('Recall', fontsize=label_size)\n",
    "ax.set_ylabel('F1 Score', fontsize=label_size)\n",
    "ax.tick_params(axis='both', which='major', labelsize=ticklabel_size)\n",
    "\n",
    "# Add some threshold annotations of SVM where F1-score is maximum\n",
    "max_f1_index = np.argmax(f1_svm)\n",
    "plt.annotate(f'Threshold: {thresholds_svm[max_f1_index]:.2f}', \n",
    "                    xy=(recall_svm[max_f1_index]*0.8, f1_svm[max_f1_index]), \n",
    "                    xytext=(5, 5), \n",
    "                    textcoords='offset points',\n",
    "                    fontsize=ticklabel_size)\n",
    "\n",
    "# Adding a dashed line to indicate the maximum F1 score\n",
    "plt.axvline(x=recall_svm[max_f1_index], color='tab:orange', linestyle='--')\n",
    "plt.axhline(y=f1_svm[max_f1_index], color='tab:orange', linestyle='--')\n",
    "\n",
    "# Adding text annotations for maximum F1 score\n",
    "plt.text(recall_svm[max_f1_index]-0.8, f1_svm[max_f1_index]-0.05, f'Max F1 Score: {f1_svm[max_f1_index]:.4f}', \n",
    "         horizontalalignment='left', verticalalignment='bottom', fontsize=ticklabel_size, color='tab:orange')\n",
    "\n",
    "# Adding text annotations below x-axis of Recall where F1-score is maximum\n",
    "plt.text(recall_svm[max_f1_index]-0.3, -0.05, f'Recall: {recall_svm[max_f1_index]:.4f}', \n",
    "         horizontalalignment='left', verticalalignment='bottom', fontsize=ticklabel_size, color='tab:orange')\n",
    "\n",
    "plt.savefig('F1_Curve.png', dpi=300)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "--------------------------------------------------------------------------------------------------\n",
    "实验5 分类模型的评估方法\n",
    "一、\t实验目的\n",
    "1.\t了解常见的分类模型评估标准\n",
    "2.\t能够绘制混淆矩阵、P-R图和ROC图\n",
    "3.\t可根据数据情况和场景需求选择合适的标准评估分类模型\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.datasets import make_classification\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, f1_score, classification_report\n",
    "\n",
    "# 1. 生成虚拟数据集\n",
    "def generate_data():\n",
    "    # 二分类数据集\n",
    "    X_binary, y_binary = make_classification(n_samples=1000, n_features=10, \n",
    "                                             n_informative=8, n_redundant=2, \n",
    "                                             random_state=42)\n",
    "    # 多分类数据集\n",
    "    X_multi, y_multi = make_classification(n_samples=1000, n_features=10, \n",
    "                                           n_informative=8, n_redundant=2, \n",
    "                                           n_classes=3, random_state=42)\n",
    "    return (X_binary, y_binary), (X_multi, y_multi)\n",
    "\n",
    "(X_binary, y_binary), (X_multi, y_multi) = generate_data()\n",
    "\n",
    "# 2. 数据集划分\n",
    "X_train_binary, X_test_binary, y_train_binary, y_test_binary = train_test_split(\n",
    "    X_binary, y_binary, test_size=0.3, random_state=42)\n",
    "X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(\n",
    "    X_multi, y_multi, test_size=0.3, random_state=42)\n",
    "\n",
    "# 3. 训练模型\n",
    "def train_models(X_train, y_train, model_type=\"binary\"):\n",
    "    if model_type == \"binary\":\n",
    "        models = {\n",
    "            \"Logistic Regression\": LogisticRegression(),\n",
    "            \"SVM\": SVC(probability=True)\n",
    "        }\n",
    "    else:\n",
    "        models = {\n",
    "            \"Decision Tree\": DecisionTreeClassifier(),\n",
    "            \"Random Forest\": RandomForestClassifier()\n",
    "        }\n",
    "    \n",
    "    trained_models = {}\n",
    "    for name, model in models.items():\n",
    "        model.fit(X_train, y_train)\n",
    "        trained_models[name] = model\n",
    "    return trained_models\n",
    "\n",
    "binary_models = train_models(X_train_binary, y_train_binary, model_type=\"binary\")\n",
    "multi_models = train_models(X_train_multi, y_train_multi, model_type=\"multi\")\n",
    "\n",
    "# 4. 绘制混淆矩阵\n",
    "def plot_confusion_matrix(y_true, y_pred, title):\n",
    "    cm = confusion_matrix(y_true, y_pred)\n",
    "    plt.imshow(cm, cmap=\"Blues\")\n",
    "    plt.title(title)\n",
    "    plt.colorbar()\n",
    "    plt.xlabel(\"Predicted\")\n",
    "    plt.ylabel(\"Actual\")\n",
    "    plt.show()\n",
    "    return cm\n",
    "\n",
    "# 5. P-R曲线\n",
    "def plot_precision_recall_curve(model, X_test, y_test, model_name):\n",
    "    y_probs = model.predict_proba(X_test)[:, 1]  # 取正类概率\n",
    "    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)\n",
    "    \n",
    "    plt.plot(recall, precision, label=model_name)\n",
    "    plt.title(f\"Precision-Recall Curve: {model_name}\")\n",
    "    plt.xlabel(\"Recall\")\n",
    "    plt.ylabel(\"Precision\")\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "\n",
    "# 6. F1-Recall曲线\n",
    "def plot_f1_recall_curve(y_test, y_probs):\n",
    "    precision, recall, thresholds = precision_recall_curve(y_test, y_probs)\n",
    "    f1_scores = [f1_score(y_test, y_probs >= t) for t in thresholds]\n",
    "    \n",
    "    plt.plot(recall[:-1], f1_scores, label=\"F1-Recall\")\n",
    "    plt.title(\"F1-Recall Curve\")\n",
    "    plt.xlabel(\"Recall\")\n",
    "    plt.ylabel(\"F1 Score\")\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "\n",
    "# 7. ROC曲线和AUC\n",
    "def plot_roc_curve(model, X_test, y_test, model_name):\n",
    "    y_probs = model.predict_proba(X_test)[:, 1]\n",
    "    fpr, tpr, _ = roc_curve(y_test, y_probs)\n",
    "    roc_auc = auc(fpr, tpr)\n",
    "    \n",
    "    plt.plot(fpr, tpr, label=f\"{model_name} (AUC = {roc_auc:.2f})\")\n",
    "    plt.title(\"ROC Curve\")\n",
    "    plt.xlabel(\"False Positive Rate\")\n",
    "    plt.ylabel(\"True Positive Rate\")\n",
    "    plt.legend()\n",
    "    plt.show()\n",
    "    \n",
    "    return roc_auc\n",
    "\n",
    "# Example: 二分类逻辑回归\n",
    "logistic_model = binary_models[\"Logistic Regression\"]\n",
    "y_pred_binary = logistic_model.predict(X_test_binary)\n",
    "plot_confusion_matrix(y_test_binary, y_pred_binary, \"Confusion Matrix - Logistic Regression\")\n",
    "plot_precision_recall_curve(logistic_model, X_test_binary, y_test_binary, \"Logistic Regression\")\n",
    "roc_auc = plot_roc_curve(logistic_model, X_test_binary, y_test_binary, \"Logistic Regression\")\n",
    "print(f\"AUC for Logistic Regression: {roc_auc:.2f}\")\n"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "authorship_tag": "ABX9TyO5gS9/MePw+FDiXJA07L6y",
   "include_colab_link": true,
   "provenance": []
  },
  "kernelspec": {
   "display_name": "machinelearning",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
